# -*- coding: utf-8 -*-
# @Time : 2021/10/20 22:54 
# @Author : TanDaBao
# @File : zhanzhang_thumbnail.py
'''
爬取站长素材高清大图分类中的主页缩略图
'''

from lxml import etree
import requests,os

from urllib.parse import urljoin

# Directory that receives the downloaded thumbnails; created when missing.
dirName = 'ImgLibs'
os.makedirs(dirName, exist_ok=True)

# URL pattern for pages 2 and up; page 1 has no numeric suffix (handled in the loop).
url = 'https://sc.chinaz.com/tupian/rentiyishu_%d.html'

# Request settings do not change between pages, so they are built once.
# NOTE(review): mapping both schemes to None is presumably meant to bypass any
# proxy configured in the environment -- confirm against the requests docs.
proxies = {"http": None, "https": None}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
# Seconds to wait for the server; without it a stalled connection blocks forever.
timeout = 10

# Characters that are path separators or otherwise unsafe in file names are
# replaced by '_' so an image title cannot break or escape the target path.
unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

for page in range(1, 6):     # crawl the first 5 pages
    if page == 1:
        new_url = 'https://sc.chinaz.com/tupian/rentiyishu.html'
    else:
        new_url = url % page

    try:
        response = requests.get(url=new_url, proxies=proxies, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # A failed listing page should not abort the remaining pages.
        print(new_url, '请求失败:', exc)
        continue

    # Force the decoding used for response.text to avoid garbled Chinese text
    # (the original note suggests trying gbk if utf-8 does not work).
    response.encoding = 'utf-8'
    html = response.text

    tree = etree.HTML(html)
    location = tree.xpath('//*[@id="container"]/div/div/a')

    for son in location:
        title = son.xpath('./img/@alt')
        # The page lazy-loads images: the real address is in the pseudo
        # attribute src2 rather than src.
        link = son.xpath('./img/@src2')
        if not title or not link:
            # Anchor without a usable thumbnail; nothing to download.
            continue

        file_name = title[0].translate(unsafe_chars).strip()
        if not file_name:
            continue

        # urljoin resolves protocol-relative ('//host/...'), absolute and
        # relative links against the page URL.
        detailed_url = urljoin(new_url, link[0])
        try:
            img_response = requests.get(url=detailed_url, proxies=proxies, headers=headers, timeout=timeout)
            img_response.raise_for_status()
        except requests.RequestException as exc:
            # Skip this image instead of saving an error page as a .jpg.
            print(title[0], '下载失败:', exc)
            continue

        imgPath = os.path.join(dirName, file_name + '.jpg')
        with open(imgPath, 'wb') as f:
            f.write(img_response.content)
        print(title[0], '下载完成')